library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
crime_df = readRDS(file = "datasets/nyc_felony_crimes.rds")
url = "https://www.census.gov/quickfacts/fact/table/newyorkcitynewyork,bronxcountybronxboroughnewyork,kingscountybrooklynboroughnewyork,newyorkcountymanhattanboroughnewyork,queenscountyqueensboroughnewyork,richmondcountystatenislandboroughnewyork/PST045217"
boro_pop = read_html(url)
boro_pop %>%
html_nodes(css = "table")
## {xml_nodeset (4)}
## [1] <table class="type" data-geospan="6">\n<colgroup>\n<col>\n<col>\n<co ...
## [2] <table class="type" data-geospan="6">\n<caption class="icon-user-3"> ...
## [3] <table class="type" data-geospan="6">\n<caption class="icon-industry ...
## [4] <table class="type" data-geospan="6">\n<caption class="icon-globe-2" ...
pop_df = (boro_pop %>% html_nodes(css = "table")) %>%
.[[1]] %>%
html_table() %>%
as_tibble() %>%
janitor::clean_names()
names(pop_df)[1:7] = c("estimate_date", "new_york_city", "bronx", "brooklyn", "manhattan", "queens", "staten_island")
pop_df = pop_df %>%
gather(key = boro_nm, value = population, estimate_date:staten_island) %>%
mutate(population = if_else(population == "Population estimates, July 1, 2017, (V2017)", "2017", population),
population = as.numeric(gsub("," , "", population)))
nycc_df = crime_df %>%
mutate(boro_nm = if_else(boro_nm == "staten island", "staten_island", boro_nm))
Looking at the individual offenses by borough over the four years:
crime_df %>%
distinct(pd_cd) %>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 221
combined_df = nycc_df %>% group_by(boro_nm, pd_cd, year) %>%
summarise(count = n())
full = left_join(combined_df, pop_df, by = "boro_nm") %>%
mutate(off_rate = ((count/population)*100))
full[,'off_rate'] = round(full[,'off_rate'],2)
test <- sqldf("SELECT boro_nm, year, off_rate
FROM full
WHERE off_rate IS NOT NULL and year = '2017'
ORDER BY boro_nm, off_rate DESC")
test_2017 <- test[!duplicated(test$boro_nm), ]
test2 <- sqldf("SELECT boro_nm, year, off_rate
FROM full
WHERE off_rate IS NOT NULL and year = '2016'
ORDER BY boro_nm, off_rate DESC")
test_2016 <- test2[!duplicated(test2$boro_nm), ]
test3 <- sqldf("SELECT boro_nm, year, off_rate
FROM full
WHERE off_rate IS NOT NULL and year = '2015'
ORDER BY boro_nm, off_rate DESC")
test_2015 <- test3[!duplicated(test3$boro_nm), ]
test4 <- sqldf("SELECT boro_nm, year, off_rate
FROM full
WHERE off_rate IS NOT NULL and year = '2014'
ORDER BY boro_nm, off_rate DESC")
test_2014 <- test4[!duplicated(test4$boro_nm), ]
test_new = rbind(test_2017, test_2016, test_2015, test_2014)
trend = ggplot(test_new, aes(x = boro_nm, y = off_rate, color = year)) +
geom_point() +
theme(legend.position = "bottom") +
labs(
x = "Borough",
y = "Offense rate",
caption = "NYC_Crime"
) +
geom_smooth(se = FALSE) +
theme(legend.position = "bottom")
ggplotly(trend)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'